I import some standard Python data-analysis packages and authenticate the session with Google Cloud so BigQuery can be queried.
%%javascript
IPython.OutputArea.auto_scroll_threshold = 9999;
from __future__ import print_function
# Dimensionality-reduction tools for visualization
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
# Colab authentication + BigQuery client library
from google.colab import auth
from google.cloud import bigquery
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Authenticate this Colab session against Google Cloud (required for BigQuery)
auth.authenticate_user()
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.preprocessing import StandardScaler,MinMaxScaler
import warnings
# Suppress pandas/sklearn warnings in notebook output
warnings.filterwarnings('ignore')
from sklearn.cluster import KMeans
Next I will need to enter some information on how to access the data.
analysis_project is the project used for processing the queries.
#@title Fill out this form then press [shift ⇧]+[enter ⏎] {run: "auto"}
import subprocess
import re

# Project used to run (and bill) the queries, plus the fully qualified
# MIMIC-III table names used throughout the notebook.
analysis_project = 'physionet-data-275415' #@param {type:"string"}
admissions_table = 'physionet-data.mimiciii_clinical.admissions' # @param {type: "string"}
patients_table = 'physionet-data.mimiciii_clinical.patients' # @param {type: "string"}
labs_event = 'physionet-data.mimiciii_clinical.labevents' # @param {type: "string"}
labs_items = 'physionet-data.mimiciii_clinical.d_labitems' # @param {type: "string"}

# Preprocess queries made with the %%bigquery magic
# by substituting these values
sub_dict = {
    'analysis_project': analysis_project,
    'admissions_table': admissions_table,
    'patients_table': patients_table,
    'ml_table_prefix': analysis_project + '.MIMIC.models_',
    'labs_event': labs_event,
    'labs_items': labs_items,
}

# Set the default project for running queries
bigquery.magics.context.project = analysis_project

# Set up the substitution preprocessing injection.
# Capture the original runner only the first time this cell executes:
# without this guard, re-running the cell wraps the previous wrapper, so the
# query would be .format()-ted twice and the second pass would raise on the
# braces already consumed from the SQL.
if getattr(bigquery.magics._run_query, '__name__', None) != 'format_and_run_query':
    original_run_query = bigquery.magics._run_query

def format_and_run_query(client, query, job_config=None):
    """Fill {placeholders} in the query from sub_dict, then run it."""
    return original_run_query(client, query.format(**sub_dict), job_config)

bigquery.magics._run_query = format_and_run_query

print('analysis_project:', analysis_project)
print()
print('custom %%bigquery magic substitutions:')
for k, v in sub_dict.items():
    print(' ', '{%s}' % k, '→', v)
# Render matplotlib figures as SVG, and create the BigQuery client used below
%config InlineBackend.figure_format = 'svg'
bq = bigquery.Client(project=analysis_project)
Create data set named MIMIC
# Create the MIMIC dataset in the analysis project if it does not exist yet.
# any() over a generator avoids materializing the full dataset list just to
# test membership.
if not any(d.dataset_id == 'MIMIC' for d in bq.list_datasets()):
    dataset_id = "{}.MIMIC".format(bq.project)
    dataset = bigquery.Dataset(dataset_id)
    dataset = bq.create_dataset(dataset)
In this part I want to get necessary data from MIMIC using BigQuery
First of all, I get the list of the top 40 most common lab tests.
%%bigquery top_labs
-- The 40 most frequently measured lab items, with human-readable labels.
SELECT ITEMID,COUNT,LABEL
FROM
-- Number of measurements recorded per lab item
(SELECT ITEMID,COUNT(*) AS COUNT
FROM `{labs_event}`
GROUP BY (ITEMID))
-- Attach the label from the lab-item dictionary table
JOIN `{labs_items}`
USING(ITEMID)
ORDER BY COUNT DESC
LIMIT 40
# Preview the resulting DataFrame
top_labs.head()
After this, I wrote a query that returns a DataFrame meeting the required conditions, namely lab results from the first hour of ICU admission, together with demographic data including gender, age, admission type, admission location, insurance, marital status, and ethnicity.
%%bigquery df_to_cluster
-- Lab results charted within the first hour of admission, joined with
-- per-admission demographics and the in-hospital death flag.
WITH top_labs AS (
-- Same top-40 lab items as the query above
SELECT ITEMID,LABEL
FROM
(SELECT ITEMID,COUNT(*) AS COUNT
FROM `{labs_event}`
GROUP BY (ITEMID))
JOIN `{labs_items}`
USING(ITEMID)
ORDER BY COUNT DESC
LIMIT 40
)
SELECT *EXCEPT(TIME)
FROM
(SELECT SUBJECT_ID,HADM_ID,ADMITTIME,ADMISSION_TYPE,ADMISSION_LOCATION,INSURANCE,MARITAL_STATUS,ETHNICITY,GENDER,
-- Age in years at admission; implausible ages (>= 200, presumably an artifact
-- of de-identified date shifting -- TODO confirm) are replaced with 95
IF(DATETIME_DIFF(ADMITTIME, DOB, DAY)/365.25 < 200,DATETIME_DIFF(ADMITTIME, DOB, DAY)/365.25, 95) AS AGE,ITEMID,CHARTTIME,VALUENUM,
-- TIME = 1 flags labs charted within one hour of admission
IF(DATETIME_DIFF(CHARTTIME, ADMITTIME, HOUR)<= 1,1,0) AS TIME,HOSPITAL_EXPIRE_FLAG as died,LABEL
FROM
((`{admissions_table}` JOIN `{patients_table}`USING (SUBJECT_ID))
JOIN
(SELECT SUBJECT_ID,HADM_ID,ITEMID,CHARTTIME,VALUENUM
FROM`{labs_event}`)
USING (SUBJECT_ID,HADM_ID))
JOIN
top_labs
USING (ITEMID))
-- Keep only first-hour lab rows; TIME itself is dropped by the EXCEPT above
WHERE TIME=1
Excess information will be deleted later
df_to_cluster.head(8)
I want to see if there are cases of repeating the same lab tests in the same HADM ID
# How often is the same lab test recorded more than once per admission?
dup_counts = df_to_cluster.groupby(['HADM_ID', 'LABEL']).count().reset_index()[['HADM_ID', 'LABEL', 'SUBJECT_ID']]
dup_counts.columns = ['HADM_ID', 'LABEL', 'Count']
# Keep only the (admission, test) pairs that actually repeat
dup_counts = dup_counts[dup_counts['Count'] > 1].sort_values('Count', ascending=False)
# For each test, count admissions per repetition count
repeated = dup_counts.groupby(['LABEL', 'Count']).count().reset_index()
repeated.columns = ['Test', 'Count of repeated tests', 'Count of admissions']
repeated
plt.figure(figsize=(15, 7))
sns.lineplot(x="Count of repeated tests", y="Count of admissions", hue="Test", data=repeated)
plt.title("Distribution repeated lab tests")
plt.xlabel("Count of repeated tests")
plt.ylabel("Count of admissions")
I decided to take the average of repeated tests and drop the duplicated rows.
# Replace repeated measurements of the same lab within an admission by their
# mean, then re-attach the (constant) demographic columns.
mean_vals = df_to_cluster.groupby(['HADM_ID', 'LABEL'])['VALUENUM'].mean().reset_index()
df_to_cluster = df_to_cluster.drop('VALUENUM', axis=1)
df_to_cluster_new = mean_vals.merge(df_to_cluster, on=['HADM_ID', 'LABEL'])
df_to_cluster_new = df_to_cluster_new.drop_duplicates()
df_to_cluster_new.head(8)
Now DataFrame is ready to apply pivot_table and transform column LABEL
# Pivot so each lab LABEL becomes its own column, one row per admission
index_cols = ['HADM_ID', 'ADMISSION_TYPE', 'ADMISSION_LOCATION', 'INSURANCE',
              'MARITAL_STATUS', 'ETHNICITY', 'GENDER', 'AGE', 'died']
df_to_clust = df_to_cluster_new.pivot_table(index=index_cols, columns='LABEL', values='VALUENUM').reset_index()
df_to_clust.head(8)
Distribution in categorical columns
I wanted to see what values categorical columns take
# Report the distinct values of every categorical (object-dtype) column.
# enumerate(start=1) replaces the manual `it` counter, and .unique() is
# computed once per column instead of twice; the printed output is unchanged.
cat_columns = df_to_clust.columns[df_to_clust.dtypes == object].tolist()
for it, col in enumerate(cat_columns, start=1):
    uniques = df_to_clust[col].unique()
    print('{}) Column:{}\nCount uniques:{}\nNamely: '.format(it, col, len(uniques)))
    for value in uniques:
        print(' -{}'.format(value))
    print('\n')
In this case, I think it is advisable to use the label encoder only for the 'GENDER' column.
I'll use the one-hot encoder for the other categorical columns, as they take too many values and I don't have enough information to correctly assign ordinal weights.
I'll do additional preprocessing on the ETHNICITY column to reduce the number of distinct values.
# Collapse fine-grained ETHNICITY labels into broad groups with a single
# dict-based replace (no source value maps into another replaced value,
# so one pass is equivalent to the chained replacements).
ethnicity_groups = {
    'OTHER': ['UNKNOWN/NOT SPECIFIED', 'UNABLE TO OBTAIN', 'PATIENT DECLINED TO ANSWER'],
    'BLACK/AFRICAN': ['BLACK/AFRICAN AMERICAN'],
    'HISPANIC/LATINO': ['HISPANIC/LATINO - GUATEMALAN', 'HISPANIC/LATINO - SALVADORAN',
                        'HISPANIC/LATINO - PUERTO RICAN', 'HISPANIC/LATINO - DOMINICAN',
                        'HISPANIC/LATINO - CUBAN', 'HISPANIC/LATINO - HONDURAN',
                        'HISPANIC/LATINO - MEXICAN', 'HISPANIC/LATINO - CENTRAL AMERICAN (OTHER)',
                        'HISPANIC/LATINO - COLOMBIAN', 'HISPANIC OR LATINO'],
    'ASIAN': ['ASIAN - CHINESE', 'ASIAN - ASIAN INDIAN', 'ASIAN - OTHER', 'ASIAN - FILIPINO',
              'ASIAN - VIETNAMESE', 'ASIAN - CAMBODIAN', 'ASIAN - KOREAN', 'ASIAN - THAI',
              'ASIAN - JAPANESE'],
    'WHITE': ['WHITE - RUSSIAN', 'WHITE - OTHER EUROPEAN', 'WHITE - EASTERN EUROPEAN'],
}
replace_map = {raw: broad for broad, raws in ethnicity_groups.items() for raw in raws}
df_to_clust['ETHNICITY'] = df_to_clust['ETHNICITY'].replace(replace_map)
df_to_clust['ETHNICITY'].unique()
# Split off the target, binary-encode gender, and prepare the remaining
# categoricals for one-hot encoding.
df_died = df_to_clust[['died']]
df_to_clust = df_to_clust.drop(['HADM_ID', 'died'], axis=1)
# Numeric columns are selected BEFORE gender is mapped, so GENDER (still
# object dtype here) stays out of the numeric frame.
num_columns = df_to_clust.columns[df_to_clust.dtypes != object].tolist()
df_to_clust['GENDER'] = df_to_clust['GENDER'].map({'F': 0, 'M': 1})
df_gender = df_to_clust[['GENDER']]
columns2dummies = ['ETHNICITY', 'MARITAL_STATUS', 'INSURANCE', 'ADMISSION_LOCATION', 'ADMISSION_TYPE']
df_categ = pd.get_dummies(df_to_clust[columns2dummies], prefix='Dummies')
df_numer = df_to_clust[num_columns]
Next, for the numeric columns I apply MinMax Scaler from SkLearn to achieve better results.
# Rescale every numeric feature (age + lab values) into [0, 1]
minmax = MinMaxScaler()
df_numer[df_numer.columns] = minmax.fit_transform(df_numer)
df_numer.head(8)
Now all numeric values lie in the range 0 to 1.
df_numer.describe()
Collect all parts of the data frame into one
# Reassemble target, gender, scaled numerics, and one-hot dummies side by side
pieces = [df_died, df_gender, df_numer, df_categ]
final_df = pd.concat(pieces, axis=1)
final_df.head(8)
And replace NaN values with -1.
# Missing lab results become -1 (a sentinel outside the [0, 1] scaled range)
final_df = final_df.fillna(value=-1)
final_df.head(8)
The final step before clustering is to drop the column 'died'.
# The clustering feature matrix excludes the outcome; keep it separately
df4cluster = final_df.drop(columns='died')
target = final_df['died']
First of all I want to see death probability and death distribution
# NOTE(review): the printed "Death probability" is died/survived (the odds),
# not died/total — confirm which definition is intended before quoting it.
print('Numbers of class 0 (not died):{}\nNumbers of class 1 (died):{}\nDeath probability:{:.3f}%'.format(target.value_counts()[0],target.value_counts()[1],100*target.value_counts()[1]/target.value_counts()[0]))
# Bar histogram of the binary target
target.hist()
Before clustering, visualize the data using TSNE
# Project the feature matrix to 2-D with t-SNE and color points by outcome
embedding = TSNE(learning_rate=50).fit_transform(df4cluster)
viz_df = pd.DataFrame()
viz_df['tsne-2d-one'] = embedding[:, 0]
viz_df['tsne-2d-two'] = embedding[:, 1]
viz_df['y'] = target
# Two-color palette, reused by the PCA plot below
flatui = ["#efe6f2", "#fd1336"]
plt.figure(figsize=(12, 8))
sns.scatterplot(
    data=viz_df,
    x="tsne-2d-one",
    y="tsne-2d-two",
    hue="y",
    palette=sns.color_palette(flatui),
    legend="full",
    alpha=0.3,
)
I see that the data is very mixed
# Same visualization, but with a linear 2-component PCA projection
pca = PCA(n_components=2)
components = pca.fit_transform(df4cluster)
pca_df = pd.DataFrame()
pca_df['pca-one'] = components[:, 0]
pca_df['pca-two'] = components[:, 1]
pca_df['y'] = target
plt.figure(figsize=(12, 8))
sns.scatterplot(
    data=pca_df,
    x="pca-one",
    y="pca-two",
    hue="y",
    palette=sns.color_palette(flatui),
    legend="full",
    alpha=0.3,
)
Using PCA, I see two clusters, but the death outcomes within them are still scattered.
Lets fit k-means model with 4 clusters
# K-means with 4 clusters on the full feature matrix, then cross-tabulate
# cluster membership against the death outcome.
kmeans = KMeans(n_clusters=4)
cluster_labels = kmeans.fit_predict(df4cluster)
df = pd.DataFrame({'labels': cluster_labels, 'targets': target})
ct = pd.crosstab(df['labels'], df['targets']).reset_index()
ct.columns = ['Cluster', 'Class 0', 'Class 1']
# Deaths relative to survivors per cluster, as a percentage
ct['Probability'] = 100 * ct['Class 1'] / ct['Class 0']
sns.barplot(y='Probability', x='Cluster', data=ct)
plt.ylabel("Death Probability")
plt.show()
display(ct)
Distribution into 4 clusters allowed us to single out the cluster with the highest mortality rate - 21% versus 12% without clustering. Two clusters with lower mortality of 5 and 10 percent are also distinguished, the probability of death in them is also lower than without clustering.
In this case, clustering allows us to improve the prediction of the probability of death.
First, I applied PCA dimensionality reduction with 2 components, and then ran the clustering model on the reduced data.
# Reduce to 2 principal components first, then run k-means in that space
pca = PCA(n_components=2)
reduced = pca.fit_transform(df4cluster)
km = KMeans(n_clusters=4)
cluster_labels = km.fit_predict(reduced)
df = pd.DataFrame({'labels': cluster_labels, 'targets': target})
ct = pd.crosstab(df['labels'], df['targets']).reset_index()
ct.columns = ['Cluster', 'Class 0', 'Class 1']
ct['Probability'] = 100 * ct['Class 1'] / ct['Class 0']
sns.barplot(y='Probability', x='Cluster', data=ct)
plt.ylabel("Death Probability")
plt.show()
display(ct)
Using the pca to optimize the clustering, it seems to me, allowed us to get improvements. Namely, we were able to slightly increase the probability of death in the cluster with the highest rate, and in the other three clusters, on the contrary, reduce it. Thus, we have improved our separation ability.
After this I wrote a function that will allow me to compare different conditions
I can set the number of laboratory tests and also use categorical features or not
The exception is gender. It is present in all cases.
def group_ethnicity_labels(series):
    """Collapse fine-grained ETHNICITY labels into broad groups.

    Same mapping as the exploratory cell above; extracted so it is written
    (and maintained) once instead of being duplicated inside fast_query.
    """
    groups = {
        'OTHER': ['UNKNOWN/NOT SPECIFIED', 'UNABLE TO OBTAIN', 'PATIENT DECLINED TO ANSWER'],
        'BLACK/AFRICAN': ['BLACK/AFRICAN AMERICAN'],
        'HISPANIC/LATINO': ['HISPANIC/LATINO - GUATEMALAN', 'HISPANIC/LATINO - SALVADORAN',
                            'HISPANIC/LATINO - PUERTO RICAN', 'HISPANIC/LATINO - DOMINICAN',
                            'HISPANIC/LATINO - CUBAN', 'HISPANIC/LATINO - HONDURAN',
                            'HISPANIC/LATINO - MEXICAN', 'HISPANIC/LATINO - CENTRAL AMERICAN (OTHER)',
                            'HISPANIC/LATINO - COLOMBIAN', 'HISPANIC OR LATINO'],
        'ASIAN': ['ASIAN - CHINESE', 'ASIAN - ASIAN INDIAN', 'ASIAN - OTHER', 'ASIAN - FILIPINO',
                  'ASIAN - VIETNAMESE', 'ASIAN - CAMBODIAN', 'ASIAN - KOREAN', 'ASIAN - THAI',
                  'ASIAN - JAPANESE'],
        'WHITE': ['WHITE - RUSSIAN', 'WHITE - OTHER EUROPEAN', 'WHITE - EASTERN EUROPEAN'],
    }
    mapping = {raw: broad for broad, raws in groups.items() for raw in raws}
    return series.replace(mapping)


def fast_query(lab=20, cat=True, clust=4):
    """Cluster admissions on the `lab` most common lab tests and display results.

    Parameters
    ----------
    lab : int
        Number of top lab tests (from the global `top_labs`) to keep as features.
    cat : bool
        If True, include one-hot encoded demographics; gender is always kept.
    clust : int
        Number of k-means clusters.

    Uses the notebook globals `top_labs` and `df_to_cluster_new` prepared
    above, and displays the per-cluster death crosstab via IPython `display`.
    """
    labs = top_labs['LABEL'].tolist()[:lab]
    data = df_to_cluster_new[df_to_cluster_new['LABEL'].isin(labs)]
    data = data.pivot_table(
        index=['HADM_ID', 'ADMISSION_TYPE', 'ADMISSION_LOCATION', 'INSURANCE',
               'MARITAL_STATUS', 'ETHNICITY', 'GENDER', 'AGE', 'died'],
        columns='LABEL', values='VALUENUM').reset_index()
    data['ETHNICITY'] = group_ethnicity_labels(data['ETHNICITY'])
    df_died = data[['died']]
    data = data.drop(['HADM_ID', 'died'], axis=1)
    # Numeric columns selected before GENDER is mapped, so it stays categorical here
    num_columns = data.columns[data.dtypes != object].tolist()
    data['GENDER'] = data['GENDER'].map({'F': 0, 'M': 1})
    df_gender = data[['GENDER']]
    columns2dummies = ['ETHNICITY', 'MARITAL_STATUS', 'INSURANCE', 'ADMISSION_LOCATION', 'ADMISSION_TYPE']
    df_categ = pd.get_dummies(data[columns2dummies], prefix='Dummies')
    df_numer = data[num_columns]
    # Scale numerics into [0, 1]; NaNs later become the sentinel -1
    scaler = MinMaxScaler()
    df_numer[df_numer.columns] = scaler.fit_transform(df_numer)
    if cat:
        final_df = pd.concat([df_died, df_gender, df_numer, df_categ], axis=1)
    else:
        final_df = pd.concat([df_died, df_gender, df_numer], axis=1)
    final_df = final_df.fillna(-1)
    df4cluster = final_df.drop('died', axis=1)
    target = final_df['died']
    model = KMeans(n_clusters=clust)
    labels = model.fit_predict(df4cluster)
    df = pd.DataFrame({'labels': labels, 'targets': target})
    ct = pd.crosstab(df['labels'], df['targets']).reset_index()
    ct.columns = ['Cluster', 'Class 0', 'Class 1']
    # NOTE(review): Class 1 / Class 0 is the death odds, not a probability
    ct['Probability'] = 100 * ct['Class 1'] / ct['Class 0']
    if cat:
        print('DataFrame with categorical features')
    else:
        print('DataFrame without categorical features')
    display(df4cluster.head(3))
    print('\nNumber clusters=', clust)
    display(ct)
    print('\n\n')
I want to consider the following cases
- Without categorical features - number of labs 40
- Without categorical features - number of labs 20
- Without categorical features - number of labs 10
- With categorical features - number of labs 20
- With categorical features - number of labs 10
For each case, I considered numbers of clusters from 2 to 6.
# Sweep every (categorical-flag, lab-count) case over 2..6 clusters
cases = [(False, 40), (False, 20), (False, 10), (True, 20), (True, 10)]
clusters = [2, 3, 4, 5, 6]
for use_cat, n_labs in cases:
    for n_clusters in clusters:
        print('Categorical features: {}'.format(use_cat))
        print('Number lab tests: {}'.format(n_labs))
        print('Number clusters: {}'.format(n_clusters))
        fast_query(cat=use_cat, lab=n_labs, clust=n_clusters)
        print('_____________________________________________________\n\n')
Having analyzed the data, it seems to me that the best clustering is achieved with 4 clusters and PCA with 2 components.
**In the course of work, I applied a clustering algorithm on data including categorical features such as gender, admission type, admission location, insurance, marital status, ethnicity and numeric features such as age and lab tests results
Using this data, it is really possible to improve the quality of the death prediction.
The most effective setup was clustering after PCA with 2 components.
With this approach, a cluster is formed with a high probability of death (22%), twice as much as in a model without the use of clustering (12%), and also two clusters with a low probability of death.
It is worth noting an interesting fact that I get the same division into clusters without using categorical features
It can be concluded that the main role is played by the results of lab tests. **